Add specs for the extraction type `xml`.

Akinori MUSHA 10 years ago
parent
commit
2bb89c4e39
2 changed files with 65 additions and 0 deletions
  1. 2 0
      app/models/agents/website_agent.rb
  2. 63 0
      spec/models/agents/website_agent_spec.rb

+ 2 - 0
app/models/agents/website_agent.rb

@@ -33,6 +33,8 @@ module Agents
33 33
 
34 34
       "@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and ".//text()" is to extract all the enclosed texts.  You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) like `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove commas from a formatted number, etc.  Note that these functions take a string, not a node set, so what you may think would be written as `normalize-space(.//text())` should actually be `normalize-space(.)`.
35 35
 
36
+      Beware that when parsing an XML document (i.e. `type` is `xml`) using `xpath` expressions, all namespaces are stripped from the document.
37
+
36 38
       When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about.  For example:
37 39
 
38 40
           "extract": {

+ 63 - 0
spec/models/agents/website_agent_spec.rb

@@ -368,6 +368,69 @@ describe Agents::WebsiteAgent do
368 368
         expect(event.payload['response_info']).to eq('The reponse was 200 OK.')
369 369
       end
370 370
 
371
+      describe "XML" do
372
+        before do
373
+          stub_request(:any, /github_rss/).to_return(
374
+            body: File.read(Rails.root.join("spec/data_fixtures/github_rss.atom")),
375
+            status: 200
376
+          )
377
+
378
+          @checker = Agents::WebsiteAgent.new(name: 'github', options: {
379
+            'name' => 'GitHub',
380
+            'expected_update_period_in_days' => '2',
381
+            'type' => 'xml',
382
+            'url' => 'http://example.com/github_rss.atom',
383
+            'mode' => 'on_change',
384
+            'extract' => {
385
+              'title' => { 'xpath' => '/feed/entry', 'value' => 'normalize-space(./title)' },
386
+              'url' => { 'xpath' => '/feed/entry', 'value' => './link[1]/@href' },
387
+              'thumbnail' => { 'xpath' => '/feed/entry', 'value' => './thumbnail/@url' },
388
+            }
389
+          }, keep_events_for: 2)
390
+          @checker.user = users(:bob)
391
+          @checker.save!
392
+        end
393
+
394
+        it "works with XPath" do
395
+          expect {
396
+            @checker.check
397
+          }.to change { Event.count }.by(20)
398
+          event = Event.last
399
+          expect(event.payload['title']).to eq('Shift to dev group')
400
+          expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
401
+          expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
402
+        end
403
+
404
+        it "works with CSS selectors" do
405
+          @checker.options['extract'] = {
406
+            'title' => { 'css' => 'feed > entry', 'value' => 'normalize-space(./title)' },
407
+            'url' => { 'css' => 'feed > entry', 'value' => './link[1]/@href' },
408
+            'thumbnail' => { 'css' => 'feed > entry', 'value' => './thumbnail/@url' },
409
+          }
410
+          @checker.save!
411
+          expect {
412
+            @checker.check
413
+          }.to change { Event.count }.by(20)
414
+          event = Event.last
415
+          expect(event.payload['title']).to be_empty
416
+          expect(event.payload['thumbnail']).to be_empty
417
+
418
+          @checker.options['extract'] = {
419
+            'title' => { 'css' => 'feed > entry', 'value' => 'normalize-space(./xmlns:title)' },
420
+            'url' => { 'css' => 'feed > entry', 'value' => './xmlns:link[1]/@href' },
421
+            'thumbnail' => { 'css' => 'feed > entry', 'value' => './media:thumbnail/@url' },
422
+          }
423
+          @checker.save!
424
+          expect {
425
+            @checker.check
426
+          }.to change { Event.count }.by(20)
427
+          event = Event.last
428
+          expect(event.payload['title']).to eq('Shift to dev group')
429
+          expect(event.payload['url']).to eq('https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af')
430
+          expect(event.payload['thumbnail']).to eq('https://avatars3.githubusercontent.com/u/365751?s=30')
431
+        end
432
+      end
433
+
371 434
       describe "JSON" do
372 435
         it "works with paths" do
373 436
           json = {